The selected dataset comes from Kaggle.
The dataset contains simulated credit card transactions with a column identifying fraud. It covers credit cards of 1000 customers doing transactions with a pool of 800 merchants.
Credit card fraud impacts both the customer (through stress, time spent resolving, limited access to expected funds) and the credit card company (through customer retention and reimbursement of customers for lost transactions).
The goal is to apply machine learning and data-mining techniques to predict, and therefore help prevent, fraudulent credit card transactions.
In this part, I conducted data preprocessing, EDA, data transformation, feature engineering, etc. After these steps, a logistic regression model and several tree-based models are built, and each model's performance is compared.
# main libraries
import pandas as pd
import numpy as np
import time
from numpy import mean, where
from collections import Counter
import statsmodels.api as sm
# visual libraries
import matplotlib.pyplot
import matplotlib.pyplot as plt
import seaborn as sns
# imblearn libraries
import imblearn
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
# sklearn libraries
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import ElasticNet, LogisticRegression
from sklearn.datasets import make_classification
from sklearn import tree, metrics
from sklearn.tree import DecisionTreeClassifier
# NOTE: sklearn.ensemble.forest was a private module removed in scikit-learn 0.24;
# the public import path is sklearn.ensemble.
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.metrics import make_scorer, roc_curve, roc_auc_score, precision_score, recall_score
from xgboost import XGBClassifier
# Load the Kaggle train/test splits and stack them into one frame for
# joint preprocessing (they are split back apart before modeling).
fraudTrain = pd.read_csv('fraudTrain.csv')
fraudTest = pd.read_csv('fraudTest.csv')
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported way
# to stack the two frames. ignore_index avoids duplicate row labels, which
# would otherwise break the later axis=1 concat of the encoded columns.
fraud_df = pd.concat([fraudTrain, fraudTest], ignore_index=True)
#fraud_df.head()
print(fraud_df.shape)
fraud_df.head()
fraud_df.isnull().any().sum()  # number of columns containing any nulls
fraud_df.dtypes
fraud_df.nunique()
# Parse the transaction timestamp and derive a plain date column for EDA.
fraud_df['trans_datetime'] = pd.to_datetime(fraud_df['trans_date_trans_time'], format='%Y-%m-%d %H:%M:%S')
fraud_df['trans_date'] = fraud_df['trans_datetime'].dt.date
# Converting dob to age
## To parse all datetime columns in advance
fraud_df['dob'] = pd.to_datetime(fraud_df['dob'], errors='coerce')
# pd.to_datetime('now') is deprecated (and UTC-ambiguous); Timestamp.now()
# is the supported spelling.
now = pd.Timestamp.now()
## Age in whole years: subtract one when this year's birthday month is still
## ahead. (Day-of-month is ignored, so ages can be off by one within the
## birthday month itself.)
fraud_df['dob'] = (now.year - fraud_df['dob'].dt.year) - ((now.month - fraud_df['dob'].dt.month) < 0)
fraud_df = fraud_df.rename(columns={'dob': 'age'})
fraud_df.head(3)
# Daily transaction counts, ordered by date.
trans_count_by_date = fraud_df['trans_date'].value_counts().sort_index().reset_index()
trans_count_by_date.columns = ['trans_date', 'count']
# Bind the returned Axes to `ax`, NOT `plt`: the original assigned it to
# `plt`, shadowing the matplotlib.pyplot alias and breaking every later
# plt.figure()/plt.show() call in the notebook.
ax = trans_count_by_date.plot.line(x='trans_date', y='count', figsize=(14, 8), title='Transaction Volume by Date')
ax.set_xlabel('Transaction Date')
ax.set_ylabel('Volume')
# Top-10 merchants by transaction volume.
ax = fraud_df['merchant'].value_counts().nlargest(10).plot.bar(figsize=(14, 8), color='orange', title='Volume by Merchant')
ax.set_xlabel('Merchant')
ax.set_ylabel('Volume')
# Transaction volume per category, most frequent first.
# subplots() returns a (Figure, Axes) tuple — unpack it instead of binding
# the whole tuple to `fig`, and pass the Axes to seaborn explicitly.
fig, ax = matplotlib.pyplot.subplots(figsize=(14, 8))
category = sns.countplot(y='category', data=fraud_df, order=fraud_df['category'].value_counts().index, ax=ax)
category.set_title('Volume by Category')
category.set_xlabel('Volume')
category.set_ylabel('Category')
# Total transaction amount per category as a pie chart.
cs = ['yellowgreen', 'gold', 'lightskyblue', 'lightcoral', 'orange', 'violet',
      'dodgerblue', 'lightblue', 'lightsteelblue', 'silver', 'lavender', 'goldenrod',
      'cadetblue', 'orchid']
category_amt = fraud_df.groupby('category')['amt'].sum()
category_amt.plot.pie(autopct="%.1f%%", colors=cs, figsize=(8, 8), title='Amount by Category')
# Boxplot of amount on a 100-row sample. Use a distinct name for the Axes:
# the original assigned it to `plt`, shadowing the pyplot alias.
ax_box = fraud_df.head(100).boxplot(column='amt', figsize=(14, 8), color='purple')
ax_box.set_title('Boxplot of Amount')
# Amount by gender on the same 100-row sample.
fig, ax = matplotlib.pyplot.subplots(figsize=(14, 8))
gender_amt = pd.DataFrame(fraud_df.head(100), columns=['amt', 'gender'])
ga = sns.boxplot(y='amt', x='gender', data=gender_amt, hue='gender', dodge=False, width=0.6, palette='Set2', ax=ax)
ga.set_title('Amount by Gender')
# Distribution of customer age across all transactions.
fig, ax = matplotlib.pyplot.subplots(figsize=(14, 8))
sns.countplot(x='age', data=fraud_df, color='lightskyblue', ax=ax)
# Drop identifiers, free-text and location columns that won't be used as features.
fraud_df = fraud_df.drop(columns=['Unnamed: 0', 'trans_date_trans_time', 'cc_num', 'merchant', 'first', 'last',
                                  'street', 'city', 'state', 'zip', 'lat', 'long', 'job', 'trans_num',
                                  'unix_time', 'merch_lat', 'merch_long', 'trans_datetime', 'trans_date'])
fraud_df.head(10)
print(fraud_df.shape)
# Only the object-dtype columns (category, gender) need encoding; the
# original also made a full copy of fraud_df that was immediately
# overwritten — dead work on a ~1.8M-row frame.
X_cat = fraud_df.select_dtypes(include=['object'])
X_enc = X_cat.copy()
print(X_enc)
# OneHotEncoding
X_enc = pd.get_dummies(X_enc, columns=['category', 'gender'])
# Replace the raw categoricals with their one-hot columns.
fraud_df = fraud_df.drop(['category', 'gender'], axis=1)
data = pd.concat([fraud_df, X_enc], axis=1)
data.head()
data.shape
In the column 'is_fraud', transactions are labeled 0 or 1: 0 = non-fraud, 1 = fraud.
# Class balance: fraud is a tiny minority of all transactions.
total = data.shape[0]
fraud = data[data['is_fraud'] == 1]
nonFraud = data[data['is_fraud'] == 0]
a = len(fraud) / total
b = len(nonFraud) / total
# Round AFTER converting to a percentage: round(a, 2) * 100 quantizes a
# ~0.5% fraud rate to 0% or 1%.
print('frauds :', round(a * 100, 2), '%')
print('non frauds :', round(b * 100, 2), '%')
# Model features: amount, demographics, and the one-hot encoded categoricals.
features = ['amt', 'city_pop', 'age', 'category_entertainment',
            'category_food_dining', 'category_gas_transport',
            'category_grocery_net', 'category_grocery_pos',
            'category_health_fitness', 'category_home', 'category_kids_pets',
            'category_misc_net', 'category_misc_pos', 'category_personal_care',
            'category_shopping_net', 'category_shopping_pos', 'category_travel',
            'gender_F', 'gender_M']
target = ['is_fraud']
# Pull out the feature matrix and label array as plain numpy arrays.
x = data.loc[:, features].values
y = data.loc[:, target].values
# Scale every feature to zero mean / unit variance so PCA is not dominated
# by the large-magnitude columns (amt, city_pop).
scaler = StandardScaler()
x = scaler.fit_transform(x)
# Scree plot of the PCA components. The y-axis label says "Variance ratio",
# so plot explained_variance_ratio_ (fractions summing to 1) rather than the
# raw explained_variance_ eigenvalues the original used.
pca = PCA()
pca.fit_transform(x)
pca_variance = pca.explained_variance_ratio_
plt.figure(figsize=(14, 8))
# One bar per component instead of a hard-coded range(19).
plt.bar(range(len(pca_variance)), pca_variance, alpha=0.5, align='center', label='individual variance')
plt.legend()
plt.ylabel('Variance ratio')
plt.xlabel('Principal components')
plt.show()
# Project the standardized features onto the first two principal components
# for 2-D visualization.
pca = PCA(n_components=2)
principalComponents = pca.fit_transform(x)
principal_df = pd.DataFrame(
    data=principalComponents,
    columns=['principal component 1', 'principal component 2'],
)
principal_df
# Re-attach the label so the projection can be colored by class.
fraud = pd.DataFrame(data=y, columns=['is_fraud'])
final_df = pd.concat([principal_df, fraud[['is_fraud']]], axis=1)
final_df
principal_df.shape
# Scatter the two principal components, colored by fraud label
# (red = fraud, blue = non-fraud).
fig = plt.figure(figsize=(14, 8))
ax = fig.add_subplot(1, 1, 1)
ax.set_xlabel('Principal Component 1', fontsize=15)
ax.set_ylabel('Principal Component 2', fontsize=15)
ax.set_title('2 component PCA', fontsize=20)
targets = [1, 0]
colors = ['r', 'b']
for target, color in zip(targets, colors):
    mask = final_df['is_fraud'] == target
    ax.scatter(final_df.loc[mask, 'principal component 1'],
               final_df.loc[mask, 'principal component 2'],
               c=color,
               s=50)
ax.legend(targets)
ax.grid()
# Split the combined frame back into the original Kaggle partitions.
# BUG FIX: fraud_df holds BOTH splits at this point, so len(fraud_df) equals
# len(data) and slicing by it left `test` empty; the partition boundary is
# the length of the original fraudTrain frame.
count = len(fraudTrain)
train = data[:count]
test = data[count:]
x = train.drop(['is_fraud'], axis=1)
y = train['is_fraud'].astype('int')
test = test.drop(['is_fraud'], axis=1)
# Hold out 30% of the training partition for model evaluation.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=123)
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)
Resampling methods are designed to add or remove samples from the training dataset in order to change the class distribution. Once the class distributions are more balanced, the suite of standard machine learning classification algorithms can be fit successfully on the transformed datasets.
Oversampling methods duplicate or create new synthetic examples in the minority class, whereas undersampling methods delete or merge examples in the majority class.
Here, I use oversampling to duplicate or create new synthetic examples in the minority class which is when is_fraud = 1.
SMOTE should be applied only to the training data; the model should be evaluated on the original test set, since the latter reflects the real-world distribution of majority and minority class samples. That's why I apply SMOTE after splitting the data.
# Oversample with SMOTE and random undersample for the imbalanced dataset.
# BUG FIX: the original cell resampled a synthetic make_classification
# dataset, so (a) the real transactions were never resampled and (b)
# x_test1/y_test1 — used by every model cell below — were never defined.
# Resample the actual training split instead, and evaluate on the untouched
# test split, which keeps the real-world class distribution.
# Summarize class distribution before resampling
print(Counter(y_train))
# Define pipeline: raise the minority up to 10% of the majority, then trim
# the majority down to 2x the minority.
over = SMOTE(sampling_strategy=0.1)
under = RandomUnderSampler(sampling_strategy=0.5)
steps = [('o', over), ('u', under)]
pipeline = Pipeline(steps=steps)
# Transform the training split
x_train1, y_train1 = pipeline.fit_resample(x_train, y_train)
# Summarize the new class distribution
counter = Counter(y_train1)
print(counter)
# The evaluation split is left untouched.
x_test1, y_test1 = x_test, y_test
# Scatter plot of the first two feature columns by class label
plt.figure(figsize=(14, 8))
for label, _ in counter.items():
    row_ix = where(np.asarray(y_train1) == label)[0]
    plt.scatter(np.asarray(x_train1)[row_ix, 0], np.asarray(x_train1)[row_ix, 1], label=str(label))
plt.legend()
plt.show()
# Pairwise correlations across all model columns, shown as a heatmap with a
# diverging palette centered at zero.
corrMatrix = data.corr()
plt.figure(figsize=(14, 14))
palette = sns.diverging_palette(150, 275, s=80, l=60, n=11)
sns.heatmap(corrMatrix, annot=True, vmin=-1, vmax=1, center=0, cmap=palette)
plt.show()
# Logistic regression on the (unresampled) training split, with balanced
# class weights to compensate for the skewed labels.
LR = LogisticRegression(penalty='l2', dual=False, tol=0.2, C=1.0, fit_intercept=True, intercept_scaling=1,
                        class_weight='balanced', random_state=24, solver='sag', max_iter=100, multi_class='auto',
                        warm_start=True, n_jobs=2)
LR.fit(x_train, y_train)
LR.score(x_train, y_train)
LR_pred = LR.predict(x_test)
# Accuracy Score on test dataset — format to two decimals; the original
# round(acc, 2) * 100 quantized the accuracy to whole percents. Also matches
# the formatting used by the RF/GBM/XGB cells.
LR_accuracy = metrics.accuracy_score(y_test, LR_pred)
print('Accuracy: %.2f%%' % (LR_accuracy * 100.0))
LR_probs = LR.predict_proba(x_test)[:, 1]
# Calculate roc auc
LR_roc_value = roc_auc_score(y_test, LR_probs)
print('\nROC AUC : ', round(LR_roc_value, 2))
# Coefficient summary via statsmodels.
# NOTE(review): this is fitted on the resampled x_train1/y_train1 and
# without sm.add_constant, so there is no intercept term — confirm intended.
logit_model = sm.Logit(y_train1, x_train1)
result = logit_model.fit()
print(result.summary2())
# confusion matrix
LRmatrix = confusion_matrix(y_test, LR_pred)
print('Confusion matrix : \n', LRmatrix)
# classification report for precision, recall f1-score and accuracy
LRmatrix1 = classification_report(y_test, LR_pred)
print('Classification report : \n', LRmatrix1)
sns.heatmap(LRmatrix, annot=True, fmt=".0f", vmin=0, vmax=555719, center=0, cmap='coolwarm')
plt.show()
# Train Decision Tree Model on the SMOTE-resampled training data.
# max_features='auto' was deprecated in scikit-learn 1.1 and removed in 1.3;
# for classifiers it meant sqrt(n_features), so use 'sqrt' explicitly.
DT = tree.DecisionTreeClassifier(criterion='entropy', splitter='best', max_depth=3, max_features='sqrt',
                                 random_state=24)
DT.fit(x_train1, y_train1)
DT.score(x_train1, y_train1)
# Predict Output
DT_pred = DT.predict(x_test1)
# Accuracy Score on test dataset (two decimals; round(acc,2)*100 quantized
# to whole percents)
DT_accuracy = metrics.accuracy_score(y_test1, DT_pred)
print('Accuracy: %.2f%%' % (DT_accuracy * 100.0))
DT_probs = DT.predict_proba(x_test1)[:, 1]
# Calculate roc auc
DT_roc_value = roc_auc_score(y_test1, DT_probs)
print('\nROC AUC : ', round(DT_roc_value, 2))
# confusion matrix
DTmatrix = confusion_matrix(y_test1, DT_pred)
print('Confusion matrix : \n', DTmatrix)
# classification report for precision, recall f1-score and accuracy
DTmatrix1 = classification_report(y_test1, DT_pred)
print('Classification report : \n', DTmatrix1)
cmap = sns.diverging_palette(150, 275, s=80, l=60, n=10)
sns.heatmap(DTmatrix, annot=True, fmt=".0f", vmin=0, vmax=555719, center=0, cmap=cmap)
plt.show()
# Extract feature importances
DT_Feature_Imp = pd.DataFrame({'features': list(features),
                               'importance': DT.feature_importances_}).\
    sort_values('importance', ascending=False)
DT_Feature_Imp.head()
# Visualize the fitted tree.
fig = plt.figure(figsize=(16, 10), dpi=900)
treeplot = tree.plot_tree(DT,
                          feature_names=features,
                          class_names=['0', '1'],
                          filled=True)
# Train Random Forest Model on the SMOTE-resampled training data.
# max_features='auto' was removed in scikit-learn 1.3; for classifiers it
# meant sqrt(n_features), so use 'sqrt' explicitly.
RF = RandomForestClassifier(n_estimators=800, bootstrap=True, oob_score=True, n_jobs=1, criterion='gini',
                            max_depth=3, max_features='sqrt', random_state=24)
RF.fit(x_train1, y_train1)
# number of trees used
print('Number of Trees used : ', RF.n_estimators)
# Predict Output
RF_pred = RF.predict(x_test1)
# Accuracy Score on test dataset
RF_accuracy = accuracy_score(y_test1, RF_pred)
print('Accuracy: %.2f%%' % (RF_accuracy * 100.0))
RF_probs = RF.predict_proba(x_test1)[:, 1]
# Calculate roc auc
RF_roc_value = roc_auc_score(y_test1, RF_probs)
print('\nROC AUC : ', round(RF_roc_value, 2))
# confusion matrix
RFmatrix = confusion_matrix(y_test1, RF_pred)
print('Confusion matrix : \n', RFmatrix)
# classification report for precision, recall f1-score and accuracy
RFmatrix1 = classification_report(y_test1, RF_pred)
print('Classification report : \n', RFmatrix1)
cmap = sns.diverging_palette(250, 30, s=80, l=60, n=20)
sns.heatmap(RFmatrix, annot=True, fmt=".0f", vmin=0, vmax=555719, center=0, cmap=cmap)
plt.show()
# Extract feature importances
RF_Feature_Imp = pd.DataFrame({'features': list(features),
                               'importance': RF.feature_importances_}).\
    sort_values('importance', ascending=False)
RF_Feature_Imp.head()
# Visualize a single tree out of the 800 in the forest.
fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(16, 10), dpi=800)
tree.plot_tree(RF.estimators_[50],
               feature_names=features,
               class_names=['0', '1'],
               filled=True);
# Side-by-side view of the first three trees.
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(16, 4), dpi=900)
for index in range(0, 3):
    tree.plot_tree(RF.estimators_[index],
                   feature_names=features,
                   class_names=['0', '1'],
                   filled=True,
                   ax=axes[index]);
    axes[index].set_title('Estimator: ' + str(index), fontsize=11)
# Gradient boosting on the SMOTE-resampled training data.
# criterion='mse' was deprecated in scikit-learn 1.0 and removed in 1.2;
# the equivalent spelling is 'squared_error'.
GBM = GradientBoostingClassifier(n_estimators=800, max_depth=4, loss='exponential', learning_rate=0.5,
                                 criterion='squared_error', min_weight_fraction_leaf=0.25, min_impurity_decrease=0.25,
                                 random_state=20, max_features='log2', warm_start=True)
GBM.fit(x_train1, y_train1)
# number of trees used
print('Number of Trees used : ', GBM.n_estimators)
# Predict Output
GBM_pred = GBM.predict(x_test1)
# Accuracy Score on test dataset
GBM_accuracy = accuracy_score(y_test1, GBM_pred)
print('Accuracy: %.2f%%' % (GBM_accuracy * 100.0))
GBM_probs = GBM.predict_proba(x_test1)[:, 1]
# Calculate roc auc
GBM_roc_value = roc_auc_score(y_test1, GBM_probs)
print('\nROC AUC : ', round(GBM_roc_value, 2))
# confusion matrix
GBMmatrix = confusion_matrix(y_test1, GBM_pred)
print('Confusion matrix : \n', GBMmatrix)
# classification report for precision, recall f1-score and accuracy
GBMmatrix1 = classification_report(y_test1, GBM_pred)
print('Classification report : \n', GBMmatrix1)
cmap = sns.diverging_palette(180, 150, s=80, l=60, n=20)
sns.heatmap(GBMmatrix, annot=True, fmt=".0f", vmin=0, vmax=555719, center=0, cmap=cmap)
plt.show()
# Extract feature importances
GBM_Feature_Imp = pd.DataFrame({'features': list(features),
                                'importance': GBM.feature_importances_}).\
    sort_values('importance', ascending=False)
GBM_Feature_Imp.head()
fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(6, 4), dpi=800)
tree.plot_tree(GBM.estimators_[30, 0],  # Get the tree number 30
               feature_names=features,
               class_names=['0', '1'],
               filled=True);
# XGBoost on the (unresampled) training split.
XGB = XGBClassifier(n_estimators=50, objective='binary:logistic', booster='gbtree',
                    colsample_bytree=0.3, learning_rate=0.1, max_depth=5, alpha=10)
XGB.fit(x_train, y_train)
# Predict Output
XGB_pred = XGB.predict(x_test)
# Accuracy Score on test dataset
XGB_accuracy = accuracy_score(y_test, XGB_pred)
print("Accuracy: %.2f%%" % (XGB_accuracy * 100.0))
XGB_probs = XGB.predict_proba(x_test)[:, 1]
# Calculate roc auc
XGB_roc_value = roc_auc_score(y_test, XGB_probs)
print('\nROC AUC : ', round(XGB_roc_value, 2))
# confusion matrix
XGBmatrix = confusion_matrix(y_test, XGB_pred)
print('Confusion matrix : \n', XGBmatrix)
# classification report for precision, recall f1-score and accuracy
# BUG FIX: the original printed GBMmatrix1 (the gradient-boosting report)
# here instead of the XGBoost report just computed.
XGBmatrix1 = classification_report(y_test, XGB_pred)
print('Classification report : \n', XGBmatrix1)
cmap = sns.diverging_palette(180, 150, s=80, l=60, n=20)
sns.heatmap(XGBmatrix, annot=True, fmt=".0f", vmin=0, vmax=555719, center=0, cmap=cmap)
plt.show()
# Extract feature importances
XGB_Feature_Imp = pd.DataFrame({'features': list(features),
                                'importance': XGB.feature_importances_}).\
    sort_values('importance', ascending=False)
XGB_Feature_Imp.head()
According to the accuracy and ROC AUC scores, XGBoost has the best performance.